In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer scikit-learn versions
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
In [2]:
csv_df = pd.DataFrame.from_csv("marijuana-street-price-clean.csv",header=0,index_col=False,sep=',',parse_dates=[-1])
In [3]:
#csv_df.head
In [4]:
csv_df_sort = csv_df.sort(columns=['State','date'])
In [5]:
# let us count the number of entries in each column
In [6]:
csv_df_sort.count()
Out[6]:
In [7]:
csv_df_sort.dtypes
Out[7]:
In [8]:
# we see that LowQ has fewer entries than the other columns; this is a common problem in time series analysis
# let's fill the NaN values with the last known value (forward fill) so that we can continue the time series analysis
# (a small toy illustration of forward fill follows below)
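In [ ]:
# A toy illustration of forward fill on made-up numbers (not from the dataset):
# each NaN is replaced by the last non-null value seen before it.
pd.Series([10.0, np.nan, np.nan, 12.0, np.nan]).fillna(method='ffill')
# -> 10.0, 10.0, 10.0, 12.0, 12.0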
In [9]:
csv_df_sort_fillna_ffil = csv_df_sort.fillna(method='ffill')
In [10]:
csv_df_sort_fillna_ffil.count()
Out[10]:
In [11]:
#now we have a clean data set to work with
In [12]:
global_mean_HighQ = csv_df_sort_fillna_ffil['HighQ'].mean()
In [13]:
global_mean_HighQ
Out[13]:
In [14]:
#now that we have the global mean, we can find how far each state's mean is from the global mean (a small sketch of this follows below)
In [15]:
print type(csv_df_sort_fillna_ffil.head())
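In [ ]:
# A minimal sketch of the idea noted above, reusing the frame and the global
# mean already defined (nothing new is assumed beyond the existing columns).
state_mean_HighQ = csv_df_sort_fillna_ffil.groupby('State')['HighQ'].mean()
deviation_from_global_mean = state_mean_HighQ - global_mean_HighQ
deviation_from_global_mean.head()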
In [16]:
#group_by_state.get_group(('Alabama'))
# to check values for a given group
# now let's use the agg function to generate the mean, median, standard deviation, variance and covariance for every state
In [17]:
basic_measures_state_wise_df = pd.DataFrame()
In [18]:
#let's get all the unique states in the data frame and construct the measures for each state
# straightforward method (kept for reference, not executed):
'''
unique_states = pd.unique(csv_df_sort_fillna_ffil['State'].ravel())
for state in unique_states:
    state_df = csv_df_sort_fillna_ffil.loc[csv_df_sort_fillna_ffil['State'] == state]
    highq_df = state_df.groupby(['State'])['HighQ'].agg([np.mean,np.median,np.std,np.var,np.cov])
    highq_df.head()
    lowq_df = state_df.groupby(['State'])['LowQ'].agg([np.mean,np.median,np.std,np.var,np.cov])
    highq_df = highq_df.rename(columns={'mean':'HighQ_Mean','median':'HighQ_Median','std':'HighQ_Std','var':'HighQ_Var','cov':'HighQ_Cov'})
    lowq_df = lowq_df.rename(columns={'mean':'LowQ_Mean','median':'LowQ_Median','std':'LowQ_Std','var':'LowQ_Var','cov':'LowQ_Cov'})
    highq_df['State'] = state
    lowq_df['State'] = state
    break
'''
Out[18]:
In [19]:
csv_df_sort_fillna_ffil.head()
Out[19]:
In [20]:
statistics_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['State'],as_index=False).aggregate\
({'HighQ': {'HighQ_Mean': np.mean,'HighQ_Median':np.median,'HighQ_Mode':stats.mode,'HighQ_Std':np.std,'HighQ_Var':np.var,'HighQ_Covar':np.cov},\
'LowQ':{'LowQ_Mean':np.mean,'LowQ_Median':np.median,'LowQ_Mode':stats.mode,'LowQ_Std':np.std,'LowQ_Var':np.var,'LowQ_Covar':np.cov}}))
print type(statistics_df)
statistics_df = statistics_df.rename(columns={'':'State'})
print type(statistics_df)
statistics_df.columns = statistics_df.columns.droplevel(0)
statistics_df.head()
Out[20]:
In [21]:
#let us now read demographics data into a dataframe
In [22]:
demographics_df = pd.DataFrame.from_csv("Demographics_State.csv",header=0,index_col=False,sep=',')
In [23]:
demographics_df.head()
Out[23]:
In [24]:
demographics_df.shape
Out[24]:
In [25]:
demographics_df.columns
Out[25]:
In [26]:
#let us now read population data into a dataframe
In [27]:
population_df = pd.DataFrame.from_csv("Population_State.csv",header=0,index_col=False,sep=",")
In [28]:
population_df.head()
Out[28]:
In [29]:
population_df.shape
Out[29]:
In [30]:
population_df.describe()
Out[30]:
In [31]:
population_df.columns
Out[31]:
In [32]:
#we are now merging the demographic and population data into one single data frame
#to do this, we are using pandas' merge method, which lets us join two data frames much like a SQL join
# we are using an inner join on the column region
In [33]:
population_demographic_merge_df = pd.merge(demographics_df,population_df,how='inner',on='region')
In [34]:
population_demographic_merge_df.head()
Out[34]:
In [35]:
population_demographic_merge_df.shape
Out[35]:
In [36]:
population_demographic_merge_df = population_demographic_merge_df.rename(columns={'region':'State'})
In [37]:
population_demographic_merge_df.head()
Out[37]:
In [38]:
statistics_population_demographic_merge_df = pd.merge(population_demographic_merge_df,statistics_df,how='inner',on='State')
In [39]:
statistics_population_demographic_merge_df.head()
Out[39]:
In [40]:
#the reason there are no entries here is that in the population/demographic data frame the state names are lower case
# let us now change the states to lower case in the statistics data frame too
In [41]:
statistics_df.head()
Out[41]:
In [42]:
statistics_df['State'] = statistics_df['State'].str.lower()
statistics_df.head()
Out[42]:
In [43]:
#now let's merge the data again; the reason we are merging all of this data is so that we can look at one state on multiple levels.
In [44]:
stats_pop_dem_merge_df = pd.merge(population_demographic_merge_df,statistics_df,how='inner',on='State')
In [45]:
stats_pop_dem_merge_df.head()
Out[45]:
In [46]:
stats_pop_dem_merge_df.shape
stats_pop_dem_merge_df.columns
Out[46]:
In [47]:
#so now we have data for 51 states with 20 feature columns.
#The data is ready
In [48]:
white_greater_black = stats_pop_dem_merge_df[stats_pop_dem_merge_df.percent_white > stats_pop_dem_merge_df.percent_black]
black_greater_white = stats_pop_dem_merge_df[stats_pop_dem_merge_df.percent_white < stats_pop_dem_merge_df.percent_black]
In [49]:
white_greater_black.shape,black_greater_white.shape
Out[49]:
In [50]:
#given that the white population is higher in almost every state, let's not do any comparison using the
#percentages of whites and blacks
In [51]:
#let us see if there is a correlation between per capita income and highQ_mean/lowQ_mean
correlation_btw_per_capita_highQ = stats.pearsonr(white_greater_black.per_capita_income,white_greater_black.HighQ_Mean)[0]
correlation_btw_per_capita_lowQ = stats.pearsonr(white_greater_black.per_capita_income,white_greater_black.LowQ_Mean)[0]
In [52]:
correlation_btw_per_capita_highQ,correlation_btw_per_capita_lowQ
Out[52]:
In [53]:
# using the population data, let us see whether total population correlates with price
population_correlation = stats.pearsonr(stats_pop_dem_merge_df.total_population,stats_pop_dem_merge_df.HighQ_Mean)[0]
population_correlation
# we see that it is inversely correlated to an extent, which implies that the greater the population, the lower the price
Out[53]:
In [54]:
# let us see if there is a big difference in the price of high quality
# weed between states selling it legally and those that don't
legal_states = ['colorado']
illegal_states = ['wyoming']
legal_states_mean = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(legal_states)]['HighQ_Mean']
illegal_states_mean = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(illegal_states)]['HighQ_Mean']
legal_states_mean = legal_states_mean.reset_index()
illegal_states_mean = illegal_states_mean.reset_index()
print legal_states_mean.head()
print illegal_states_mean.head()
# we see that the legal state sells HighQ marijuana at a lower price (a rough significance check follows below)
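In [ ]:
# A rough follow-up sketch (not part of the original comparison above): Welch's
# t-test on the daily HighQ series of the two example states, Colorado (legal)
# and Wyoming (not legal), to see whether the difference in means is significant.
colorado_prices = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == 'Colorado']['HighQ']
wyoming_prices = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == 'Wyoming']['HighQ']
stats.ttest_ind(colorado_prices, wyoming_prices, equal_var=False)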
In [55]:
# from this we understand that the price of low quality weed is somewhat more affected by per capita income than
# that of high quality weed
# we can use the same approach to compute correlations amongst other attributes too (a sketch follows below)
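In [ ]:
# A sketch of that idea: pairwise Pearson correlations across a few numeric
# columns of the merged frame (the column subset here is just illustrative).
numeric_cols = ['per_capita_income','total_population','percent_white','percent_black','HighQ_Mean','LowQ_Mean']
stats_pop_dem_merge_df[numeric_cols].corr()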
In [56]:
#before we go any further, let's try to find some interesting trends in weed prices for each state
csv_df_sort_fillna_ffil.head()
Out[56]:
In [57]:
country_mean_highQ = csv_df_sort_fillna_ffil.groupby(['date'],as_index=False).aggregate({'HighQ':np.mean})
In [58]:
country_mean_highQ.head()
Out[58]:
In [59]:
country_mean_highQ.min()
Out[59]:
In [60]:
#we just have dates; we need to generate the calendar week for each date
In [61]:
from datetime import datetime
# date format in %Y-%m-%d
def get_calendar_week(date):
    calendar_week = date.isocalendar()[1]
    return int(calendar_week)
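In [ ]:
# A quick sanity check of the helper (illustrative date, not from the dataset):
# 2014-01-01 falls in ISO calendar week 1.
get_calendar_week(datetime.strptime("2014-01-01", "%Y-%m-%d"))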
In [62]:
# let's add the calendar week to the data frame
csv_df_sort_fillna_ffil['Calendar_Week'] = csv_df_sort_fillna_ffil['date'].apply(get_calendar_week)
In [63]:
#let's check that it looks fine
csv_df_sort_fillna_ffil.head()
Out[63]:
In [64]:
#now let us group at the week level and see during which week the price was lowest across the country
country_week_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_week_highQ_df.columns = country_week_highQ_df.columns.droplevel(0)
country_week_highQ_df = country_week_highQ_df.rename(columns={'':'Calendar_Week'})
country_week_highQ_df.head()
print country_week_highQ_df.min(),country_week_highQ_df.max()
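In [ ]:
# matplotlib was imported at the top but not used so far; a quick illustrative
# sketch of how the weekly country-wide mean could be visualised.
plt.figure(figsize=(10, 4))
plt.plot(country_week_highQ_df['Calendar_Week'], country_week_highQ_df['HighQ_Mean'])
plt.xlabel('Calendar week')
plt.ylabel('Mean HighQ price')
plt.title('Country-wide mean HighQ price by calendar week')
plt.show()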
In [65]:
#now let us check on which day it was the lowest across the country
country_day_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['date'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_day_highQ_df.columns = country_day_highQ_df.columns.droplevel(0)
country_day_highQ_df = country_day_highQ_df.rename(columns={'':'date'})
country_day_highQ_df.head()
print country_day_highQ_df.min(),country_day_highQ_df.max()
In [66]:
# now let us use HighQN (the number of HighQ price reports) as a rough proxy for when people buy the most across the country
country_week_highQN_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week'],as_index=False).aggregate({'HighQN':{'HighQN_Mean':np.mean}}))
country_week_highQN_df.columns = country_week_highQN_df.columns.droplevel(0)
country_week_highQN_df = country_week_highQN_df.rename(columns={'':'Calendar_Week'})
country_week_highQN_df.head()
print country_week_highQN_df.min(),country_week_highQN_df.max()
In [67]:
# The above set of statements works across the country based on date and calendar week
# Now let us look at what happens within each month by generating the calendar week of the month, and then see during
# which parts of the month the price is at its highest/lowest
In [68]:
def get_calendar_week_month(date):
    calendar_week_year = get_calendar_week(date)
    start_of_month = datetime(date.year, date.month, 1)
    calendar_week_start_of_month = start_of_month.isocalendar()[1]
    if calendar_week_year >= calendar_week_start_of_month:
        return calendar_week_year - calendar_week_start_of_month + 1
    else:
        # this happens only for late December, when the date's ISO week wraps
        # around to week 1 of the next year
        return ((date.day + start_of_month.weekday()) // 7) + 1
In [69]:
# a quick check that the calendar-week-of-month function works
get_calendar_week_month(datetime.strptime("2013-12-30","%Y-%m-%d"))
Out[69]:
In [70]:
csv_df_sort_fillna_ffil['Calendar_Week_Month'] = csv_df_sort_fillna_ffil['date'].apply(get_calendar_week_month)
In [71]:
csv_df_sort_fillna_ffil.head()
Out[71]:
In [72]:
#now let us group at the week-of-month level and see during which week of the month the price was lowest
country_month_week_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week_Month'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_month_week_highQ_df.columns = country_month_week_highQ_df.columns.droplevel(0)
country_month_week_highQ_df = country_month_week_highQ_df.rename(columns={'':'Calendar_Week_Month'})
country_month_week_highQ_df.head()
print country_month_week_highQ_df.min()
print country_month_week_highQ_df.max()
In [73]:
csv_df_sort_fillna_ffil.columns
Out[73]:
In [74]:
#now that we have some insights into the data, let us take the next step and use a linear regression model to predict the
# prices of marijuana for the state of Alabama
columns_required = ['State','HighQ','date']
alabama_state = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == "Alabama"]
alabama_state_price_date = alabama_state.ix[:,columns_required]
alabama_state_price_date.dtypes
Out[74]:
In [75]:
train,test = train_test_split(alabama_state_price_date,test_size = 0.2)
In [76]:
def changeDateToOrdinal(date):
    return date.toordinal()
In [77]:
train_pd = pd.DataFrame(train,columns=['State','Price','Date'])
train_pd['State'] = 1
train_pd['Price'] = train_pd['Price'].astype(float)
train_pd['Date_ord'] = train_pd['Date'].apply(changeDateToOrdinal)
train_pd.head()
Out[77]:
In [78]:
train_pd.head()
Out[78]:
In [79]:
test_pd = pd.DataFrame(test,columns=['State','Price','Date'])
test_pd['State'] = 1
test_pd['Price'] = test_pd['Price'].astype(float)
test_pd['Date_ord'] = test_pd['Date'].apply(changeDateToOrdinal)
In [80]:
print train_pd.head()
train_pd.dtypes
print test_pd.head()
test_pd.dtypes
Out[80]:
In [81]:
x_train = train_pd.ix[:,['State','Date_ord']]
y_train = train_pd.ix[:,['Price']]
print x_train.dtypes,y_train.dtypes
x_test = test_pd.ix[:,['State','Date_ord']]
y_test = test_pd.ix[:,['Price']]
x_train.head()
Out[81]:
In [82]:
y_train.head()
y_train.dtypes
Out[82]:
In [83]:
print x_train.head()
x_train.dtypes
Out[83]:
In [84]:
ols = linear_model.LinearRegression(normalize=True, fit_intercept=True, n_jobs=-1)
ols.fit(x_train, y_train)
print ols.coef_
ols_predict = ols.predict(x_test)
#print ols_predict
# attach the model's predicted prices to the test data frame
test_pd['Predicted_Price'] = ols_predict
In [85]:
test_pd.head()
Out[85]:
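In [ ]:
# The fit is never quantified above; a minimal sketch of scoring the model on
# the held-out test set (R^2 from the estimator, mean absolute error by hand).
r2_test = ols.score(x_test, y_test)
mae_test = np.mean(np.abs(test_pd['Predicted_Price'] - test_pd['Price']))
r2_test, mae_test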